# Figure_3.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# This file produces Figure 3 in the article: "Effects of education on 
# potential mechanisms."



if (interactive()) {
  ONE_COLUMN_ESTIMATOR    <- 'OLS'  # if just one column, depict OLS or IV? 
  SHORT_PANEL             <- FALSE  # plot a short panel or one that takes up the page?
} else {
  clArgs <- commandArgs(trailingOnly = TRUE)
  if (length(clArgs) <= 0) { 
    writeLines(paste0(
      "clArgs has length ", length(clArgs), 
      ".  This length is less than 1, so the script is stopping."))
    stop()
  }
  ONE_COLUMN_ESTIMATOR    <- clArgs[1] 
  SHORT_PANEL             <- as.logical(clArgs[2]) 
}
library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))       # for qw(), rescale()
library(dplyr)         # for %>%, mutate(), to copy df and change cols. in one go
library(grid)          
library(Hmisc)         # for Dotplot()
library(ivpack)        # for clustered SEs
library(lattice)
library(multiwayvcov)  # for cluster.vcov()
library(tibble)        # for tibble()

source('IV_setup.R')
source('mechanismsVarInfo.R')
source('functions/estimateModels.R')
source('float_code/dotplotParameters.R')
source("functions/getClusters.R")
dirOutput    <- 'float_output/'
filenameStem <- 'Figure_3'
PDF_title    <- 'Figure 3: Potential mediators of the effects of education on economic attitudes'   



##############################################################################
# PRELIMINARIES FOR DRAWING THE FIGURE
##############################################################################
# createMechanismsVarInfoDF() creates a data frame with one row per variable.
# The columns are the variable names, the names of the corresponding data 
# frames, the human-readable descriptions of the variables (which are labels
# that appear in the tables and figures), and so on.  
varInfoDF <- createMechanismsVarInfoDF()
PS_width      <- 9
PS_height     <- 9
baseCexSize   <- 1.0 
axisTickSize  <- .4
xAxisTextSize <- .690 * baseCexSize
if (ONE_COLUMN_ESTIMATOR=='OLS') {
  xlistAt     <- seq(0, .06, by = .02)
  xlistLabels <- qw("0 .02 .04 .06")
  xlistLimits <- c(-.015, .075)
} else {
  xlistAt       <- seq(-.03, .09, by = .03)  
  xlistLabels   <- qw("-.03 0 .03 .06 .09")  
  xlistLimits   <- c(-.06, .12)
}
panelHeight      <- if (SHORT_PANEL) list(3.45, "inches") else list(4.50, "inches")
panelWidth       <- list(2.20, "inches") 
panelLayout      <- c(1, 1)  # columns, then rows
stripHeight      <- list(lines = 1.33, lineheight = 14) # lineheight is space between lines
stripText        <- NULL
panelBorderWidth <- .3
nPanels          <- panelLayout[1] * panelLayout[2] 
xBetween         <- 1.2000   # space between columns
yBetween         <- 3.5000   # space between rows
stripTextSize    <- 0.700 * baseCexSize 
xLabSize         <- 0.900 * baseCexSize 
yLabSize         <- xLabSize 
xList <- list(
  draw        = TRUE,   
  limits      = xlistLimits,
  labels      = xlistLabels,  
  at          = xlistAt,  
  tck         = c(axisTickSize, 0), 
  col         = panelBorderCol, 
  cex         = xAxisTextSize,
  alternating = 1, 
  relation    = 'free', 
  axs         = 'i')  # axs='i' means that there is no padding around the xlimits
yAxisTextSize    <- 0.720 * baseCexSize
yList <- list(
  draw        = TRUE,
  limits      = if (SHORT_PANEL) c(.3, length(varInfoDF$yLabels) + .84) else c(.5, length(varInfoDF$yLabels) + .6),
  labels      = varInfoDF$yLabels,
  at          = length(varInfoDF$yLabels):1,
  tck         = 0,
  col         = panelBorderCol,
  cex         = yAxisTextSize,
  alternating = c(1, 1),
  relation    = 'same',  
  axs         = 'i')




##############################################################################
# REVISE MODEL(S)
##############################################################################
# Eliminate all models but the baseline model.
rm(list = ls(pat='mod[2-9]', env = IVModelsEnv), envir = IVModelsEnv)


# Create a model suitable for OLS estimation, using the normal mechanisms 
# variables. This is just the "OLS version of the IV model"; it is the second
# stage of the IV model, except that there is no instrumenting for years of
# education.
LMModelsEnv <- eapply(IVModelsEnv, function (x) {
  tmp <- getLMFormula(x)
  attributes(tmp)$modNum <- attributes(x)$modNum
  tmp
})
LMModelsEnv <- as.environment(LMModelsEnv)


# Create a restricted model that does not control for state of interview.
LMModelsEnv_oneYearOnly <- new.env()
LMModelsEnv_oneYearOnly$eqwlth.mod1 <- update(
  LMModelsEnv$eqwlth.mod1,
  . ~ . - yearInt.fac | . - yearInt.fac )
attributes(LMModelsEnv_oneYearOnly$eqwlth.mod1)$modNum <- 1



##############################################################################
# RECODE VARIABLES
##############################################################################
# CHANGE YEAR-OF-INTERVIEW FIXED EFFECTS FOR SOME VARIABLES
# To estimate the models with controls for age, year at age 14, and year of 
# interview, I collapse two categories in the year-of-interview factor 
# variable. Normally, I collapse 1994 and 1996: see note 10 in the article.
# But for some of the mechanisms variables, a different coding is necessary.
GSS.df.8990 <- mutate(GSS.df,  yearInt.fac = Recode(yearInt, '1989=1990', as.factor = TRUE))
varInfoDF$dfNamesMech[grepl('noPov|prestigeMobility', varInfoDF$varNamesMech)] <- 'GSS.df.8990'


# RESCALE THE MECHANISMS VARIABLES
for (i in 1:length(varInfoDF$varNamesMech)) {
  varName <- varInfoDF$varNamesMech[i]
  dfName  <- varInfoDF$dfNamesMech[i]
  mechDF  <- get(dfName) 
  mechVar <- get(varName, envir = as.environment(mechDF))
  if ('logical' %in% class(mechVar)) { next }  # skip boolean variables
  mechVar <- Bullock::rescale(mechVar)  
  mechDF[, varName] <- mechVar
  assign(dfName, mechDF)
}



##############################################################################
# ESTIMATE MODELS
##############################################################################
IVModelObjectsEnvMechs <- estimateModels(  
  varNames     = varInfoDF$varNamesMech[!varInfoDF$OLS_only], 
  modelEnvir   = IVModelsEnv, 
  dfNames      = varInfoDF$dfNamesMech[!varInfoDF$OLS_only],
  objectSuffix = '.IV') 
LMModelObjectsEnvMechs <- estimateModels(  
  varNames     = varInfoDF$varNamesMech[!varInfoDF$oneYearOnly], 
  modelEnvir   = LMModelsEnv, 
  dfNames      = varInfoDF$dfNamesMech[!varInfoDF$oneYearOnly],
  objectSuffix = '.lm',
  estFun       = lm) 

# In varInfoDF, five variables are listed as being asked in a single year.
# A few of those variables may seem to have been asked in more than one year.
# For example, if you inspect ANES.df, eqOppExists may seem to have been asked 
# in 1972, 1974, and 1976. But this appearance is misleading. It arises 
# because members of the 1972-74-76 ANES panel have multiple cases (rows) in
# the cumulative ANES dataset. In IV_setup.R, I aggregate each respondent's
# multiple records into one record, so there is no sense in which some
# respondents are double-counted in other analyses. But in this analysis, the 
# code below helps to ensure that we don't control for year of interview when
# analyzing questions that are asked in only one year.  [2019 07 24]
LMModelObjectsEnvMechs_oneYearOnly <- estimateModels(  
  varNames     = varInfoDF$varNamesMech[varInfoDF$oneYearOnly], 
  modelEnvir   = LMModelsEnv, 
  dfNames      = varInfoDF$dfNamesMech[varInfoDF$oneYearOnly],
  objectSuffix = '.lm',
  estFun       = lm) 


IVClusters <- lapply(
  IVModelObjectsEnvMechs,
  function (x) { 
    modelRHS <- as.character(x$formula[3]) 
    if (grepl('factor(yearYoungNorm)', modelRHS, fixed = TRUE)) { 
      droplevels(x$model$stateYoung:x$model$'factor(yearYoungNorm)')
    } else {
      droplevels(x$model$stateYoung:factor(x$model$yearYoungNorm))
    }
  }
)

LMClusters  <- lapply(LMModelObjectsEnvMechs, getClusters)
LMModelVcovClustered <- mapply(
  FUN     = cluster.vcov,
  model   = as.list(LMModelObjectsEnvMechs),
  cluster = LMClusters)

LMClusters_oneYearOnly  <- lapply(LMModelObjectsEnvMechs_oneYearOnly, getClusters)
LMModelVcovClustered_oneYearOnly <- mapply(
  FUN     = cluster.vcov,
  model   = as.list(LMModelObjectsEnvMechs_oneYearOnly),
  cluster = LMClusters_oneYearOnly)



##############################################################################
# CALCULATE N
##############################################################################
n_IV  <- sapply(IVModelObjectsEnvMechs, nobs) %>%
  tibble(
    depVar = factor(
      x      = gsub('(.*)\\..*', '\\1', names(.)), 
      levels = varInfoDF$varNamesMech),
    N      = .)
n_OLS  <- sapply(LMModelObjectsEnvMechs, nobs) %>%
  tibble(
    depVar = factor(
      x      = gsub('(.*)\\..*', '\\1', names(.)), 
      levels = varInfoDF$varNamesMech),
    N      = .)
n_OLS_oneYearOnly  <- sapply(LMModelObjectsEnvMechs_oneYearOnly, nobs) %>%
  tibble(
    depVar = factor(
      x      = gsub('(.*)\\..*', '\\1', names(.)), 
      levels = varInfoDF$varNamesMech),
    N      = .)



##############################################################################
# CREATE DATA FRAME WITH DATA TO PLOT
##############################################################################
dataToPlotIV <- expand.grid(
  lower     = NA, 
  pred      = NA, 
  upper     = NA,
  SE        = NA,
  depVar    = varInfoDF$varNamesMech,
  panel     = 'IV')
dataToPlotOLS             <- mutate(dataToPlotIV,  panel = 'OLS')
dataToPlotOLS_oneYearOnly <- dataToPlotOLS 

for (varName in varInfoDF$varNamesMech) {
  
  # Get the IV estimate and SE for a given variable
  if (! varInfoDF$OLS_only[varInfoDF$varNamesMech == varName]) {
    tmpEduc     <- get(paste0(varName, '.IV1'), IVModelObjectsEnvMechs)
    tmpCluster  <- get(paste0(varName, '.IV1'), IVClusters)
    coefEstEduc <- coeftest(tmpEduc)['educ', 'Estimate']
    coefSEEduc  <- cluster.robust.se(tmpEduc, tmpCluster)['educ', 'Std. Error']  
    dataToPlotIV[dataToPlotIV$depVar == varName, qw("lower pred upper SE")] <- c(
      coefEstEduc - 1.96*coefSEEduc,
      coefEstEduc,
      coefEstEduc + 1.96*coefSEEduc,
      coefSEEduc)
  }
  
  # Get the OLS estimate and SE for a given variable  
  treatmentName  <- 'educ' 
  varNameIndex   <- which(varInfoDF$varNamesMech == varName)
  if (! varInfoDF$oneYearOnly[varNameIndex]) {
    tmpEducOLS     <- get(paste0(varName, '.lm1'), LMModelObjectsEnvMechs)
    tmpEducVcov    <- get(paste0(varName, '.lm1'), LMModelVcovClustered)
    coefEstEducOLS <- coeftest(tmpEducOLS)[treatmentName, 'Estimate']
    coefSEEducOLS  <- coeftest(tmpEducOLS, vcov. = tmpEducVcov)[treatmentName, 'Std. Error']
    dataToPlotOLS[dataToPlotOLS$depVar == varName, qw("lower pred upper SE")] <- c(
      coefEstEducOLS - 1.96*coefSEEducOLS,
      coefEstEducOLS,
      coefEstEducOLS + 1.96*coefSEEducOLS,
      coefSEEducOLS)  
  } else {
    tmpEducOLS     <- get(paste0(varName, '.lm1'), LMModelObjectsEnvMechs_oneYearOnly)
    tmpEducVcov    <- get(paste0(varName, '.lm1'), LMModelVcovClustered_oneYearOnly)
    coefEstEducOLS <- coeftest(tmpEducOLS)[treatmentName, 'Estimate']
    coefSEEducOLS  <- coeftest(tmpEducOLS, vcov. = tmpEducVcov)[treatmentName, 'Std. Error']
    dataToPlotOLS[dataToPlotOLS$depVar == varName, qw("lower pred upper SE")] <- c(
      coefEstEducOLS - 1.96*coefSEEducOLS,
      coefEstEducOLS,
      coefEstEducOLS + 1.96*coefSEEducOLS,
      coefSEEducOLS)      
  }
} 


# ADD SAMPLE-SIZE INFORMATION
dataToPlotIV  <- left_join(dataToPlotIV,  n_IV,  by = "depVar")
dataToPlotOLS <- left_join(dataToPlotOLS, n_OLS, by = "depVar")
for (varnameOneYearOnly in n_OLS_oneYearOnly$depVar) {
  dataToPlotOLS$N[dataToPlotOLS$depVar == varnameOneYearOnly] <- n_OLS_oneYearOnly$N[n_OLS_oneYearOnly$depVar == varnameOneYearOnly]
}


# ADD ROW NUMBERS 
# Assign row numbers in reverse order of varInfoDF$varNamesMech. For example,
# the last variable in that list is assigned to position 1 (the bottom), the
# second-to-last variable in that list is assigned to position (2), and so on.
dataToPlotIV$rowNum  <- length(varInfoDF$varNamesMech):1
dataToPlotOLS$rowNum <- length(varInfoDF$varNamesMech):1

# Put one row of vertical white space between the different tiers of 
# variables that are plotted: basic self-interest, economic individualism,
# and social engagement.  
dataToPlotIV$rowNum[1:07]  <- dataToPlotIV$rowNum[1:07] + 1   # separate the self-interest variables
dataToPlotOLS$rowNum[1:07] <- dataToPlotOLS$rowNum[1:07] + 1  # separate the self-interest variables
yList$at        <- dataToPlotIV$rowNum 
yList$limits[2] <- yList$limits[2] + 1



# MAKE THE "dataToPlot" DATA FRAME
# One is for the OLS estimates. The other is for the IV estimates.
if (ONE_COLUMN_ESTIMATOR=='OLS') {
  dataToPlot    <- dataToPlotOLS
} else if (ONE_COLUMN_ESTIMATOR=='IV') {
  dataToPlot <- dataToPlotIV
}



##############################################################################
# PANEL FUNCTION
##############################################################################
myPanel <- function(pred, SE, N, subscripts, ...) {
  panel.abline(v = 0, col = dotplotLineColor)
  panel.Dotplot(...)

  
  # Tick marks at top of each panel
  panel.axis(
    side        = "top",
    at          = xList$at,
    draw.labels = FALSE,
    half        = FALSE,
    tck         = axisTickSize * 1.0)

    
  # Annotations to right of right-hand panel. 
  pred <- gsub('^-0', '-', round(pred, 2))
  pred <- gsub('^0',  '',  pred)
  pred <- gsub('(\\.\\d)$', '\\10', pred)
  pred[pred == ''] <- '.00'
  SE <- round(SE, 3) %>%
    gsub('^0', '', .) %>%                          # remove leading 0
    str_pad(width=4, side = 'right', pad = 0) %>%  # e.g., change '.03' to '.030'
    paste0('(', ., ')')
  N <- paste0('N\U2006=\U2006', N)
  grid.text(
    x     = 1.03,
    y     = unit(yList$at, "native"),
    label = paste0(pred, ' ', SE, ', ', N),
    hjust = 0,  # 1 = right-aligned
    gp    = gpar(cex = yAxisTextSize))
  
}



##############################################################################
# MAKE THE PDF FILE
##############################################################################
pdf(
  file   = paste0(dirOutput, filenameStem, '.pdf'), 
  width  = PS_width, 
  height = PS_height, 
  paper  = "special", 
  title  = PDF_title,
  bg     = postscriptBackground)
trellis.par.set(
  axis.components = list(left = list(pad1 = .875)),   # distance between axis ticks and tick labels
  axis.line = list( 
    alpha = 1, 
    col   = panelBorderCol, # to eliminate panel border, set col to "white"
    lty   = 1, 
    lwd   = panelBorderWidth),  
  dot.line = list(alpha=1, col=dotplotLineColor, lty=1, lwd=1),
  dot.symbol = list(
    alpha = 1, 
    cex   = .5, 
    col   = dotColor, 
    font  = 1, 
    pch   = 16),        
  plot.line = list(alpha = 1, col = dotColor, lty = 1, lwd = 1),
  superpose.line = list(
    alpha = c(1,1), 
    col   = c('black', plotLineColor), 
    lty   = c("solid", "43"), 
    lwd   = c(1, plotLineWidth)))


# CREATE PDF FILE
mainResultsDotplot <- Dotplot(
  rowNum ~ Cbind(pred, lower, upper) | panel, 
  data     = dataToPlot,
  pred     = dataToPlot$pred,
  SE       = dataToPlot$SE,
  N        = dataToPlot$N,
  panel    = myPanel,
  layout   = panelLayout,
  between  = list(x = xBetween, y = yBetween),                   
  as.table = TRUE,
  strip    = FALSE,
  scales   = list(x = xList, y = yList),
  xlab     = '',
  ylab     = '',
  par.settings = list(clip = list(panel="off", strip="off")))
print(
  x            = mainResultsDotplot, 
  panel.width  = panelWidth, 
  panel.height = panelHeight)


# ADD LABEL THAT IS CENTERED UNDERNEATH THE PANELS
downViewport("plot_01.figure.vp")
figureLabel <- "Marginal effects of a year of education"
grid.text(
  label = figureLabel,
  y     = -.04,
  gp    = gpar(cex = xLabSize)) 


# SAVE AND PROCESS FILE
dev.off()
if (!(Sys.which('pdfcrop')=='' | Sys.which('perl')=='')  |  'pdfcrop' %in% installed.packages()[, 'Package']) {  # if "pdfcrop" is installed 
  system(paste(
    if (Sys.info()['sysname']=='Windows') paste(Sys.getenv('Comspec'), '/c ') else '',
    'pdfcrop', 
    paste0(dirOutput, filenameStem, '.pdf'), 
    paste0(dirOutput, filenameStem, '_crop.pdf')))
  file.remove(paste0(dirOutput, filenameStem, '.pdf'))
  if (interactive() & Sys.info()['sysname']=='Windows') shell.exec(paste0(normalizePath(dirOutput), filenameStem, '_crop.pdf'))
}



##############################################################################
# AUXILIARY RESULTS
##############################################################################
if (interactive()) {

  # COMPARE OLS AND IV ESTIMATES
  # "[W]e can repose some confidence..."
  data.frame(
    depvar = dataToPlotOLS$depVar,
    OLS    = dataToPlotOLS$pred,
    "2SLS" = dataToPlotIV$pred)
  
  
  # GET FIRST-STAGE F STATISTICS FOR IV ESTIMATES
  # "The first-stage F statistics exceed 30 for every IV estimate mentioned in
  # this section."
  Fstats <- sapply(
    X   = IVModelObjectsEnvMechs, 
    FUN = function (x) { summary(x, diagnostics = TRUE)$diagnostics['Weak instruments', 'statistic'] } )
  Fstats
}

